library(RedditExtractoR)
help("find_subreddits")
help("find_thread_urls")
# Fetch the content of every thread whose URL is listed in
# `reddit_deepfake_turkey$url`. The "threads" and "comments" elements of the
# result are read below.
threads_contents <- get_thread_content(reddit_deepfake_turkey$url)
str(threads_contents$threads)
upvotes_reddit <- data.frame(threads_contents[["threads"]][["upvotes"]])

# Comment-level table with the columns Date / Text / Upvote.
reddit_comment <- data.frame(threads_contents[["comments"]][["comment"]])
reddit_comment_upvote <- data.frame(threads_contents[["comments"]][["upvotes"]])
reddit_comment_date <- data.frame(threads_contents[["comments"]][["date"]])
reddit_general_comment <- cbind(
  reddit_comment_date,
  reddit_comment,
  reddit_comment_upvote
)
names(reddit_general_comment)[1:3] <- c("Date", "Text", "Upvote")

# Thread-level table built from the thread titles, same three columns.
reddit_upvote_date <- data.frame(threads_contents[["threads"]][["date"]])
reddit_upvote_title <- data.frame(threads_contents[["threads"]][["title"]])
reddit_upvote <- data.frame(threads_contents[["threads"]][["upvotes"]])
reddit_second_deepfake <- cbind(
  reddit_upvote_date,
  reddit_upvote_title,
  reddit_upvote
)
names(reddit_second_deepfake)[1:3] <- c("Date", "Text", "Upvote")

# From here on `reddit_upvote_title` holds the thread body text, not the title.
reddit_upvote_title <- data.frame(threads_contents[["threads"]][["text"]])

# Stack the thread-level and comment-level tables into one data frame.
# NOTE(review): `reddit_first_deepfake` is not created in the visible part of
# this script - presumably a Date/Text/Upvote table built from the thread body
# text; confirm where it comes from.
reddit_deepfake_whole <- rbind(
  reddit_first_deepfake,
  reddit_second_deepfake,
  reddit_general_comment
)

# Frequency of each distinct upvote value.
number_upvote <- data.frame(table(reddit_deepfake_whole$Upvote))

# Export the combined table.
write.csv(reddit_deepfake_whole, file = "reddit_deepfake_whole.csv")
#################
# Look up subreddits for the keyword "deepfake".
reddit_deepfake <- find_subreddits("deepfake")

# Thread URLs for "deepfakes" over the whole time range; no subreddit
# restriction is passed here.
top_cats_urls_1 <- find_thread_urls(keywords = "deepfakes", period = "all")

# Thread URLs for "deepfake", restricted to the "ankara" subreddit.
top_cats_urls_4 <- find_thread_urls(
  keywords = "deepfake",
  subreddit = "ankara",
  period = "all"
)
# ---- tm corpus cleaning ----
# Builds a tm corpus from `new$text` and strips punctuation, case, numbers,
# extra whitespace and English stop words.
# NOTE(review): `new` is not created in the visible part of this script and
# `peter_text_clean` is not referenced again below - confirm it is still needed.
peter_text <- Corpus(VectorSource(new$text))
peter_text_clean <- tm_map(peter_text, removePunctuation)
peter_text_clean <- tm_map(peter_text_clean, content_transformer(tolower))
peter_text_clean <- tm_map(peter_text_clean, removeNumbers)
peter_text_clean <- tm_map(peter_text_clean, stripWhitespace)
peter_text_clean <- tm_map(peter_text_clean, removeWords, stopwords("english"))

# ---- tidytext word counts ----
# tibble() replaces the deprecated data_frame().
peter_pan_df <- tibble(Text = last_df$Text)

# One row per word, then drop the words listed in `stop_words`. The join key
# is given explicitly so the join does not have to guess it.
# NOTE(review): tidytext's `stop_words` is an English list - confirm that is
# appropriate for this text.
peter_words <- peter_pan_df %>%
  unnest_tokens(output = word, input = Text) %>%
  anti_join(stop_words, by = "word")

# Word frequencies, most frequent first.
peter_wordcounts <- peter_words %>% count(word, sort = TRUE)

head(peter_wordcounts)

# Horizontal bar chart of the words that occur more than 200 times.
peter_wordcounts %>%
  filter(n > 200) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Word \n",
    y = "\n Count ",
    title = "Frequent Words In Reddit Posts And Comments \n"
  ) +
  geom_text(aes(label = n), hjust = 1.2, colour = "white", fontface = "bold") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title.x = element_text(face = "bold", colour = "darkblue", size = 12),
    axis.title.y = element_text(face = "bold", colour = "darkblue", size = 12)
  )

# ---- most used words ----
library(openxlsx)
write.xlsx(peter_wordcounts, "most_used_words_reddit.xlsx")
#############
# Month/year label ("%m/%Y") for every thread, stored in `tarih` ("date").
# `date_utc` goes through as.Date() first: if the column is stored as text,
# format() ignores the `format` string and returns the values unchanged.
reddit_deepfake_turkey$tarih <- format(
  as.Date(reddit_deepfake_turkey$date_utc),
  format = "%m/%Y"
)
library(dplyr)
# Number of threads per month/year label.
reddit_year <- reddit_deepfake_turkey %>% count(tarih)
######
# Print the number of threads per subreddit.
table(reddit_deepfake_turkey$subreddit)

# One-column "Text" frames: thread bodies (NA values dropped) and titles.
text_reddit <- data.frame(na.omit(reddit_deepfake_turkey$text))
title_reddit <- data.frame(reddit_deepfake_turkey$title)
names(text_reddit) <- "Text"
names(title_reddit) <- "Text"

# Stack bodies, then titles, then the rows of `genel_comments`.
new_df_1 <- rbind(text_reddit, title_reddit)
last_df <- rbind(new_df_1, genel_comments)
##################text analysis
library(dplyr)
library(tidytext)
library(ggplot2)
# Split the `text` column of `svp_corpus_new` into one word per row.
# NOTE(review): elsewhere in this script `svp_corpus_new` is assigned a
# quanteda corpus - confirm it is a data frame with a `text` column here.
peter_words <- svp_corpus_new %>%
  unnest_tokens(output = word, input = text)

# Count the tokens produced above. The previous version counted `isis1`,
# which is a plain character vector and has no `word` column.
peter_wordcounts <- peter_words %>% count(word, sort = TRUE)

# Horizontal bar chart of the words that occur more than 100 times.
peter_wordcounts %>%
  filter(n > 100) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Word \n",
    y = "\n Count ",
    title = "Frequent Words In Reddit Comments"
  ) +
  geom_text(aes(label = n), hjust = 1.2, colour = "white", fontface = "bold") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title.x = element_text(face = "bold", colour = "darkblue", size = 12),
    axis.title.y = element_text(face = "bold", colour = "darkblue", size = 12)
  )

# Bar chart of `most_used_word` (columns `Word` and `n`).
# NOTE(review): `most_used_word` is not created in the visible part of this
# script - confirm its source.
ahme <- ggplot(most_used_word, aes(x = Word, y = n)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 50, hjust = 1))
ahme +
  labs(y = "Number", x = "Word") +
  ggtitle("Most Used 15 Words") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

#######URL
library(stringr)

# Regular expression used to pick http/https URLs out of the comment text.
url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

# First match per comment; NA where a comment has no match.
genel_comments[["ContentURL"]] <- str_extract(
  string = genel_comments$Text,
  pattern = url_pattern
)

# Keep only the comments that had a URL and export them.
url_reddit <- data.frame(na.omit(genel_comments$ContentURL))
library(openxlsx)
write.xlsx(url_reddit, file = "reddit_url_turkey.xlsx")
###########categorization
# Keyword dictionaries and dictionary-based filtering.
#
# Each dictionary is collapsed into one "term1|term2|..." regular expression
# before it is handed to str_detect(). Passing the dictionary vector directly
# as `pattern` pairs the i-th text with the i-th term (recycling) instead of
# testing every text against every term.
# Matching is case-sensitive and on substrings.
# NOTE(review): short terms such as "kk", "31", "gal" or "PC" will also hit
# inside longer words - confirm that is acceptable.
# NOTE(review): "ç1plak", "if_a" and "k1l1çdaro" look like mis-encoded
# spellings; they are kept as-is in case the data contains them.
porn_dictionary <- c(
  "ifşa", "porno", "nude", "nsfw", "göt", "meme", "sexs", "sex", "sikiş",
  "sikis", "sekse", "seks", "sansürsüz", "secret", "gizli", "pornstar",
  "penis", "otuzbir", "milf", "ünlü", "masturbasyon", "31", "cinsel",
  "dildo", "porn", "sakso", "cima", "ç1plak", "penis", "sansürsüz", "if_a"
)
library(stringr)
porn_reddit <- reddit_deepfake_turkey[
  str_detect(reddit_deepfake_turkey$text, paste(porn_dictionary, collapse = "|")),
]
library(openxlsx)
write.xlsx(porn_reddit, "porn_icerik_reddit.xlsx")

celebrity_dictionary <- c(
  "hadise", "scarlett", "şeyma", "barbara", "elish", "müge",
  "anlı", "tugay", "palvin", "aleyna", "tilki", "subaşı", "şeyma",
  "duygu", "özaslan", "berna", "gal", "gadot", "johansson", "özberk",
  "pqueen", "acun", "angelina", "jolie", "anli", "ezgi", "johanasson"
)
celebrity_reddit <- genel_comments[
  str_detect(genel_comments$Text, paste(celebrity_dictionary, collapse = "|")),
]
write.xlsx(celebrity_reddit, "celebrity_reddit.xlsx")

politic_dictionary <- c(
  "kemal", "kılıçdaroğlu", "kk", "rte", "erdo", "tayyip", "recep", "erdoğan",
  "bahçeli", "devlet", "apo", "k1l1çdaro", "chp", "akp", "mhp", "seçim",
  "muhalefet", "ittifak", "cumhur", "millet"
)
politics_reddit_1 <- genel_comments[
  str_detect(genel_comments$Text, paste(politic_dictionary, collapse = "|")),
]
# Export the object created on the line above (the previous version wrote
# `politics_reddit`, which is never assigned in this section).
write.xlsx(politics_reddit_1, "political_reddit.xlsx")

technical_dictionary <- c(
  "program", "yardım", "nasıl", "1080", "gereksinim", "PC", "sistem"
)
technical_reddit <- genel_comments[
  str_detect(genel_comments$Text, paste(technical_dictionary, collapse = "|")),
]
write.xlsx(technical_reddit, "technical_reddit.xlsx")
# Regex-based categorisation of thread titles, thread bodies and comments.
# Every dictionary is collapsed once into a "term1|term2|..." alternation and
# reused for the three sources.
anime_dic <- c("anime", "manga", "hentai", "kenway", "pqueen")
anime_pattern <- paste(anime_dic, collapse = "|")
anime_comments <- reddit_deepfake_turkey[
  str_detect(reddit_deepfake_turkey$title, anime_pattern),
]
anime_reddit_last <- genel_comments[
  str_detect(genel_comments$Text, anime_pattern),
]
anime_reddit_last_2 <- reddit_deepfake_whole[
  str_detect(reddit_deepfake_whole$Text, anime_pattern),
]

# ---- new and last dictionary ----
# The celebrity, politics and sexual subsets are removed again right after
# they are built; only the technical subsets are kept.
celebrity_pattern <- paste(celebrity_dictionary, collapse = "|")
celebrity_reddit_comments <- reddit_deepfake_turkey[
  str_detect(reddit_deepfake_turkey$title, celebrity_pattern),
]
celebrity_reddit_comments_1 <- reddit_deepfake_turkey[
  str_detect(reddit_deepfake_turkey$text, celebrity_pattern),
]
celebrity_reddit_last <- genel_comments[
  str_detect(genel_comments$Text, celebrity_pattern),
]
rm(celebrity_reddit_comments, celebrity_reddit_comments_1, celebrity_reddit_last)

politics_pattern <- paste(politic_dictionary, collapse = "|")
politics_reddit_comments <- reddit_deepfake_turkey[
  str_detect(reddit_deepfake_turkey$title, politics_pattern),
]
politics_reddit_comments_1 <- reddit_deepfake_turkey[
  str_detect(reddit_deepfake_turkey$text, politics_pattern),
]
politics_reddit_last <- genel_comments[
  str_detect(genel_comments$Text, politics_pattern),
]
rm(politics_reddit_comments, politics_reddit_comments_1, politics_reddit_last)

sexual_pattern <- paste(porn_dictionary, collapse = "|")
sexual_reddit_comments <- reddit_deepfake_turkey[
  str_detect(reddit_deepfake_turkey$title, sexual_pattern),
]
sexual_reddit_comments_1 <- reddit_deepfake_turkey[
  str_detect(reddit_deepfake_turkey$text, sexual_pattern),
]
sexual_reddit_last <- genel_comments[
  str_detect(genel_comments$Text, sexual_pattern),
]
rm(sexual_reddit_comments, sexual_reddit_comments_1, sexual_reddit_last)

technical_pattern <- paste(technical_dictionary, collapse = "|")
technical_comments <- reddit_deepfake_turkey[
  str_detect(reddit_deepfake_turkey$title, technical_pattern),
]
technical_comments_1 <- reddit_deepfake_turkey[
  str_detect(reddit_deepfake_turkey$text, technical_pattern),
]
technical_reddit_last <- genel_comments[
  str_detect(genel_comments$Text, technical_pattern),
]

# The workspace is deliberately NOT cleared here. A blanket rm(list = ls())
# at this point deleted the dictionaries and `genel_comments`, which the
# quanteda section further down still reads.
####
library(quanteda)
library(LSX)
# Identify context words for the pattern "kemal".
# The data passed in must be a tokens object - take care that it is.
eco <- char_context(
  toks_1,
  pattern = "kemal",
  p = 0.05
)
# Original note (translated): the `eco` result gave us the list of words
# related to the military.
#####dictionary_based
library(tidyverse)
library(quanteda)
library(readtext)
library(arm)
library(sp)
library(rgdal)
# ---- Clean the publication titles ----
# Full URLs, shortened links, @mentions and a leading "RT" are removed BEFORE
# digits and punctuation are blanked out. Once ":", "/", "." and "@" have been
# replaced by spaces those patterns can no longer match.
isis_tweet_az <- Dimensions_Publication_2023_07_18_16_14_21$Title
isis_tweet_az <- gsub("(s?)(f|ht)tp(s?)://\\S+\\b", "", isis_tweet_az)
isis_tweet_az <- gsub(
  "[A-Za-z]{1,5}[.][A-Za-z]{2,3}/[A-Za-z0-9]+\\b", "", isis_tweet_az
)
isis_tweet_az <- gsub("@[[:alnum:]_]{4,}", "", isis_tweet_az)
isis_tweet_az <- gsub("^RT:? ", "", isis_tweet_az)
# `replacement` is spelled out instead of relying on partial matching.
isis_tweet_az <- gsub(pattern = "[[:digit:]]", replacement = " ", isis_tweet_az)
isis_tweet_az <- gsub(pattern = "[[:punct:]]", replacement = " ", isis_tweet_az)
# Drop every character outside the ASCII range.
isis_tweet_az <- gsub("[^\x01-\x7F]", "", isis_tweet_az)

# ---- Tokenise the Reddit comments ----
library(tm)
# Collapse runs of whitespace, lower-case, then tokenise.
isis_tweet <- stripWhitespace(genel_comments$Text)
isis1 <- tolower(isis_tweet)
toks_1 <- tokens(isis1)
# NOTE(review): `stpo_w` is not referenced again in the visible script.
stpo_w <- stopwords("tr")

# The custom stop-word file is read before it is used for token removal.
stopwords_tur <- readLines("stopwords_turkish.txt", warn = FALSE)
toks_3 <- tokens_remove(toks_1, ref_StopWordListTR$StopWord)
toks_3 <- tokens_remove(toks_3, stopwords_tur)

svp_corpus_new <- corpus(as.character(isis1))
# NOTE(review): `religion` is not referenced again in the visible script.
religion <- readLines("religion_dic_turkish.txt", warn = FALSE)
# Four-category quanteda dictionary built from the keyword vectors.
svp_dict <- dictionary(list(
  sexual = porn_dictionary,
  celebrity = celebrity_dictionary,
  politics = politic_dictionary,
  technical = technical_dictionary
))

# Tokenise the corpus, map tokens to dictionary categories, then build the
# document-feature matrix. Calling dfm() directly on a corpus with a
# `dictionary` argument is deprecated in recent quanteda releases.
svp_dfm_dict <- dfm(
  tokens_lookup(tokens(svp_corpus_new), dictionary = svp_dict)
)
head(svp_dfm_dict)

# Share of dictionary hits per category, in percent.
category_share <- prop.table(topfeatures(svp_dfm_dict, 20)) * 100
category_share

# Pie chart with labels derived from the data, so every slice is labelled
# with its own category name and share.
pie(
  category_share,
  labels = sprintf("%s=%.2f%%", names(category_share), category_share),
  col = rainbow(6)
)

#################
library(dplyr)
library(tidytext)
# One row per word of the comment text. The input column is `Text`, the only
# column left after select(); the previous version asked for `title`, which
# does not exist at that point.
tidy_trump_tweets <- genel_comments %>%
  select(Text) %>%
  unnest_tokens(output = word, input = Text)

# tf-idf of the title words, treating each `date_utc` value as one document.
tidy_trump_tfidf <- reddit_deepfake_turkey %>%
  select(date_utc, title) %>%
  unnest_tokens(output = word, input = title) %>%
  count(word, date_utc) %>%
  bind_tf_idf(word, date_utc, n)

# Highest tf-idf first.
top_tfidf <- tidy_trump_tfidf %>%
  arrange(desc(tf_idf))

# The word with the highest tf-idf score (the original author's note records
# it as "nude").
top_tfidf$word[1]
